<a href="https://github.com/Sandeep-Desai/CS328-WrittenAssignment">Github Link</a>
# Render an HTML button that hides/shows the notebook's code cells.
# Relies on a `codeToggle()` JavaScript function defined in the exported page.
from IPython.display import HTML
HTML('''<button type="button" class="btn btn-outline-danger" onclick="codeToggle();">Toggle Code</button>''')
import os
import pandas as pd
import requests
from tqdm import tqdm
import math
import plotly.graph_objs as go
import plotly.express as px
from collections import defaultdict, Counter
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
import gensim
from wordcloud import WordCloud
import imageio
from IPython.display import Image
import numpy as np
import re
import string
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import word_tokenize
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import silhouette_samples, silhouette_score
import joblib
import matplotlib.pyplot as plt
import json
import warnings
warnings.filterwarnings('ignore')
We have used two different datasets from the following sources:
For doing clustering we have used the NeurIPS dataset and for remaining analysis we have used the second dataset.
# Load the conference-metadata dataset and inspect its schema.
df = pd.read_csv('papers.csv')
df.columns
Index(['Conference', 'Year', 'Title', 'Author', 'Affiliation'], dtype='object')
The second dataset consists of 3 different conferences:
# Bar chart of how many rows each of the three conferences contributes.
df_cleaned = pd.read_csv('df_cleaned.csv')
df_cleaned['Conference'].value_counts().plot(kind='bar',color=['red','blue','green'])
<AxesSubplot: >
data1 = pd.read_csv("issue_4/papers1.csv")
def get_top10(data, year):
    """Return the 10 most frequent affiliations for a given year.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain 'Year' and 'Affiliation' columns.
    year : int
        Publication year to filter on.

    Returns
    -------
    (list, list)
        Affiliation names and their counts, sorted by count descending.
    """
    temp = data[data.Year == year]
    unique_occ = defaultdict(int)
    unique_occ.update(Counter(temp['Affiliation']))
    # Drop the placeholder affiliation if present.
    unique_occ.pop('None', None)
    # Normalize long institute names to their common abbreviations.
    # BUG FIX: the original did `unique_occ['MIT'] = unique_occ.pop('…', None)`,
    # which stored None whenever the long name was absent for that year and
    # then crashed sorted() comparing None with int. It also *overwrote* any
    # existing count for the short name; here counts are merged instead.
    renames = {
        'Massachusetts Institute of Technology': 'MIT',
        'Carnegie Mellon University': 'CMU',
        'Google DeepMind': 'DeepMind',
    }
    for long_name, short_name in renames.items():
        if long_name in unique_occ:
            unique_occ[short_name] += unique_occ.pop(long_name)
    # Filter out NaN affiliations *before* slicing so the result always
    # contains up to 10 real entries (the original filtered after slicing).
    valid = [item for item in unique_occ.items() if str(item[0]) != 'nan']
    sorted_occ = sorted(valid, key=lambda x: x[1], reverse=True)[:10]
    top10 = [x[0] for x in sorted_occ]
    top10_count = [x[1] for x in sorted_occ]
    return top10, top10_count
# Sanity check: top-10 affiliations for 2006.
top10, top10_count = get_top10(data1, 2006)
print(top10)
print(top10_count)
['Google', 'Stanford University', 'CMU', 'UC Berkeley', 'University of California, Irvine', 'Microsoft Research', 'MPI for Intelligent Systems, Tübingen', 'Princeton', 'ETH Zurich', 'National ICT Australia'] [14, 13, 13, 11, 9, 9, 7, 7, 7, 6]
# Interactive bar chart: top-10 affiliations per year, driven by a slider.
fig = go.Figure()

initial_year = 2006
names, counts = get_top10(data1, initial_year)
fig.add_trace(
    go.Bar(
        x=names,
        y=counts,
        name="Year=" + str(initial_year),
        text=counts,
        textposition='auto',
        marker=dict(color=counts, colorscale='Viridis'),
        width=0.9,
    )
)

# One slider step per year (2006-2021); each step swaps in that year's data.
steps = []
for yr in range(2006, 2022):
    names, counts = get_top10(data1, yr)
    steps.append(dict(
        method='update',
        args=[
            {"x": [names], "y": [counts], "name": ["Year=" + str(yr)], "text": [counts]}
        ],
        label=str(yr),
    ))

sliders = [
    dict(
        active=0,
        steps=steps,
        currentvalue={"prefix": "Year: ", "font": {"size": 13}},
        len=0.9,
        x=0.1,
        y=0,
        pad={"t": 30, "b": 10},
    )
]

fig.update_layout(
    sliders=sliders,
    xaxis_title="Institutes",
    yaxis_title="Number of Papers",
    title="Top 10 Institutes",
    font=dict(size=13),
    width=1500,
    height=600,
    xaxis=dict(
        tickmode='linear',
        dtick=1,
        tickangle=0,
        tickfont=dict(size=10),
    ),
)
fig.show()
The above plot displays the contribution of different institutions in publishing research papers in a particular year. The year can be changed using the slider. The top 10 institutes in the respective year are displayed along with the number of papers they have published. The number of papers published has increased from only around 10 papers in 2006 to about 200 papers in 2020. However, in the year 2021, the number of papers decreased considerably, to approximately 100, due to the COVID crisis. Certain institutions such as Google, Stanford University, MIT, and CMU persist in the top 10, thus contributing significantly every year to research in the field of AI. Also, DeepMind, founded in 2010 (and acquired by Google in 2015, after which it was named Google DeepMind), showed considerable progress through the years due to the deep-learning boom in AI.
def preprocess(text):
    """Tokenize `text` and drop stopwords plus a few domain-ubiquitous terms."""
    # These words appear in nearly every ML paper title and carry no signal.
    extra_stopwords = ('learn', 'learning', 'models')
    tokens = simple_preprocess(text, deacc=True)
    return [tok for tok in tokens
            if tok not in STOPWORDS and tok not in extra_stopwords]
# Map each publication year to the list of preprocessed title token lists.
preprocessed_titles = {
    yr: [preprocess(t) for t in data1.loc[data1['Year'] == yr, 'Title']]
    for yr in data1['Year'].unique()
}
# Generate word clouds for each year and save them as separate images
wordclouds = []
for year in preprocessed_titles.keys():
    # Flatten the year's token lists into a single word -> frequency table.
    word_freq = dict(Counter(word for title in preprocessed_titles[year] for word in title))
    wordcloud = WordCloud(width=800, height=400, background_color='white',
                          colormap='viridis', max_words=100).generate_from_frequencies(word_freq)
    plt.figure(figsize=(12, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"Year : {year}")
    plt.savefig(f"issue_4/wordcloud_{year}.png")
    wordclouds.append((year, wordcloud))
    plt.close()
# Create the gif animation using imageio
# BUG FIX: frames were saved under issue_4/ above but previously read back
# without the directory prefix, so mimsave loaded stale/missing files.
images = [imageio.imread(f"issue_4/wordcloud_{year}.png") for year in data1['Year'].unique()]
imageio.mimsave('issue_4/wordcloud_animation.gif', images, fps=2)
The gif above is the animation of wordclouds of titles of papers published per year. As seen in the wordclouds, the common words in the title before 2012 mostly have bayesian, naive, etc as the most prominent words. However, after 2012 due to the shift of the world towards neural networks and deep learning, the word clouds after 2012 display the words neural, networks, deep, etc as the most significant words.
# Per-university word-cloud animations for a handful of top institutes.
universities = ['MIT', 'UC Berkeley', 'DeepMind', 'Stanford', 'CMU']
for university in universities:
    uni_data = data1[data1['Affiliation'] == university]
    # Year -> list of preprocessed title token lists, for this university only.
    preprocessed_titles = {}
    for year in uni_data['Year'].unique():
        titles = uni_data[uni_data['Year'] == year]['Title']
        preprocessed_titles[year] = [preprocess(title) for title in titles]
    wordclouds = []
    for year in preprocessed_titles.keys():
        word_freq = dict(Counter(word for title in preprocessed_titles[year] for word in title))
        wordcloud = WordCloud(width=800, height=400, background_color='white',
                              colormap='viridis', max_words=100).generate_from_frequencies(word_freq)
        plt.figure(figsize=(12, 8))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f"{university} - Year : {year}")
        plt.savefig(f"issue_4/{university}_wordcloud_{year}.png")
        wordclouds.append((year, wordcloud))
        plt.close()
    # BUG FIX: read the frames from the same issue_4/ directory they were
    # saved to above (the prefix was previously missing here).
    images = [imageio.imread(f"issue_4/{university}_wordcloud_{year}.png") for year in uni_data['Year'].unique()]
    imageio.mimsave(f'issue_4/{university}_wordcloud_animation.gif', images, fps=2)
![MIT word clouds](<issue_4/MIT_wordcloud_animation.gif>) |
![UC Berkeley word clouds](<issue_4/UC Berkeley_wordcloud_animation.gif>) |
![DeepMind word clouds](<issue_4/DeepMind_wordcloud_animation.gif>) |
![Stanford word clouds](<issue_4/Stanford_wordcloud_animation.gif>) |
![CMU word clouds](<issue_4/CMU_wordcloud_animation.gif>) |
# Geocode each unique affiliation to a country via the Nominatim API.
df = pd.read_csv('papers.csv')
unique_affiliations_df = df['Affiliation'].unique()
unique_affiliations_df = pd.DataFrame(unique_affiliations_df, columns=['Affiliation'])
# pd.DataFrame(unique_affiliations_df).to_csv('dump_country_analysis/unique_affiliations.csv')
unique_affiliations_df = pd.read_csv('dump_country_analysis/unique_affiliations.csv')
unique_affiliations_df['country'] = None
none_count = 0
for i in tqdm(range(len(unique_affiliations_df))):
    try:
        institute_name = unique_affiliations_df.loc[i, 'Affiliation']
        url = f"https://nominatim.openstreetmap.org/search?q={institute_name}&format=json&accept-language=en"
        response = requests.get(url)
        if response.status_code == 200:
            results = response.json()
            if len(results) > 0:
                # display_name ends with the country, e.g. "..., United States".
                country = results[0]['display_name'].split(',')[-1].strip()
                unique_affiliations_df.loc[i, 'country'] = country
            else:
                # BUG FIX: use .loc instead of chained indexing
                # (df['country'][i] = ...), which pandas may apply to a
                # temporary copy (SettingWithCopyWarning).
                unique_affiliations_df.loc[i, 'country'] = 'None'
                none_count += 1
        else:
            # FIX: the original f-string had no placeholders; include context.
            print(f"API ERROR: status {response.status_code} for row {i}")
    except Exception as e:
        # FIX: catch Exception instead of a bare except (which would also
        # swallow KeyboardInterrupt/SystemExit), and report the cause.
        print(f"ERROR: {i} ({e})")
        unique_affiliations_df.loc[i, 'country'] = 'None'
        none_count += 1
print(f"None count: {none_count}")
# pd.DataFrame(unique_affiliations_df).to_csv('dump_country_analysis/unique_affiliations.csv')
# Rebuild the affiliation -> country mapping from the cached lookup files,
# then attach a country to every paper row.
university_affiliation_df = pd.read_csv('dump_country_analysis/unique_affiliations.csv')
none_df = pd.read_csv('dump_country_analysis/none_df.csv')
university_affiliation_dict = dict(zip(university_affiliation_df['Affiliation'],
                                       university_affiliation_df['country']))
# Manually-resolved entries (where the API returned nothing) extend/override the map.
university_affiliation_dict.update(zip(none_df['Affiliation'], none_df['country']))
papers = pd.read_csv('papers.csv')
papers['country'] = None
none_count = 0
for i in tqdm(range(len(papers))):
    papers.loc[i, 'country'] = university_affiliation_dict[papers.loc[i, 'Affiliation']]
#save papers_with_country
# papers.to_csv('dump_country_analysis/papers_with_country.csv')
Using OpenStreetMap and Elsevier, we were able to determine the country for 93% of the contributions; the remaining 7% of the papers could not be found in those databases. We will be using papers_with_country.csv for the rest of the analysis.
The dataset contains multiple rows for the same paper, each representing a contribution from an author's affiliation. To account for this, the number of contributions is used as a metric instead of the number of papers, with each row representing a unique contribution.
# Export the country count
# papers['country'].value_counts().to_csv('dump_country_analysis/country_count.csv')
The below code reads a dataset of papers with country information, calculates the number of contributions done by the top 15 countries for each year, and saves the results in two separate CSV files, one with the raw data and another with the logarithmic transformation of the data.
# read papers_with_country
papers = pd.read_csv('dump_country_analysis/papers_with_country.csv')
years = papers['Year'].unique()
# Top 15 countries by total contributions (take 16 so 'None' can be dropped).
top_15 = dict(papers['country'].value_counts()[:16])
del top_15['None']
# One row per country, one column per year, holding contribution counts.
df_country_year = pd.DataFrame(columns=['country'] + list(years))
for country in top_15:
    row = [country]
    for year in years:
        mask = (papers['country'] == country) & (papers['Year'] == year)
        row.append(int(mask.sum()))
    df_country_year.loc[len(df_country_year)] = row
# df_country_year.to_csv('dump_country_analysis/country_year_contributions.csv')
# Log-transformed copy (a count of 0 is left as 0 instead of log(0) = -inf).
df_country_year_log = df_country_year.copy()
for year in years:
    df_country_year_log[year] = df_country_year_log[year].apply(lambda x: 0 if x == 0 else math.log(x))
# df_country_year_log.to_csv('dump_country_analysis/country_year_contributions_log.csv')
# Plot the log-scaled per-country contribution counts over time.
df = pd.read_csv('dump_country_analysis/country_year_contributions_log.csv')
df = df.set_index('country')
years = df.columns.values[1:]  # skip the CSV's saved index column
traces = [go.Scatter(x=years, y=df.loc[c][1:], name=c) for c in df.index]
layout = go.Layout(title='Country Data',
                   xaxis=dict(title='Year'),
                   yaxis=dict(title='Value'))
fig = go.Figure(data=traces, layout=layout)
fig.show()
The line chart displays the trend of top 15 contributing countries in AI research from 2006 to 2021, with each country represented by a separate line. The y-axis represents the number of contributions and the x-axis represents the year, with the United States showing the highest number of contributions.
# NOTE: pandas is already imported at the top of the file; this re-import
# is a harmless notebook leftover.
import pandas as pd
# data-E4Qdp.csv is the country_count.csv file with some country name changes
data = pd.read_csv('dump_country_analysis/data-E4Qdp.csv')
# Read the country-boundary GeoJSON.
# NOTE(review): `geojson` is never passed to px.choropleth below — the map
# uses plotly's built-in country-name geometry instead. Confirm whether this
# read is still needed.
with open('dump_country_analysis/countries.geojson') as f:
    geojson = f.read()
# World map colored by number of contributions per country.
fig = px.choropleth(data, locations='Country', locationmode='country names',
                    color='Count', range_color=(0, max(data['Count'])),
                    title='Contributions by Countries', hover_name='Country')
fig.update_layout(geo=dict(showframe=False, showcoastlines=False, projection_type='equirectangular'),
                  margin=dict(l=0, r=0, t=50, b=0))
fig.show()
This choropleth map shows the number of AI paper contributions by country, with brighter colors representing higher numbers of contributions. The legend displays the range of values for each color. The United States has the highest number of contributions.
The below code calculates the slope of the number of contributions done by each country for each year, adjusts the values by adding the absolute value of the minimum slope, takes the logarithm of the adjusted values, and saves the results in a CSV file.
# Year-over-year change ("slope") in each country's contribution count,
# shifted to be non-negative and then log-scaled.
df_country_year = pd.read_csv('dump_country_analysis/country_year_contributions.csv')
# columns[1:] appears to skip the CSV's saved index column, leaving
# 'country' followed by the year columns — TODO confirm against the file.
columns = df_country_year.columns[1:]
# NOTE(review): the frame is declared with columns[:-1] (final year dropped),
# yet the loop below writes to columns[j+1] up to the final year, which
# silently appends that column back via .loc — verify this is intentional.
df_country_year_slope = pd.DataFrame(columns=list(columns[:-1]))
minn_slope = 100000  # tracks the most negative slope seen, for the shift below
for i in range(len(df_country_year)):
    country = df_country_year.loc[i, 'country']
    # Seed the row with zeros; the first year column never receives a slope.
    df_country_year_slope.loc[len(df_country_year_slope)] = [country]+[0]*(len(df_country_year_slope.columns)-1)
    for j in range(1,len(columns)-1):
        # Difference between consecutive years = discrete slope.
        slope = df_country_year.loc[i, columns[j+1]] - df_country_year.loc[i, columns[j]]
        df_country_year_slope.loc[df_country_year_slope['country'] == country, columns[j+1]] = slope
        if slope < minn_slope:
            minn_slope = slope
#add minn_slope to all values
# Shift everything by |min slope| so all values are non-negative before log.
for i in range(len(df_country_year_slope)):
    for j in range(1,len(columns)):
        df_country_year_slope.loc[i, columns[j]] += abs(minn_slope)
for year in df_country_year_slope.columns[1:]:
    # A value of exactly 0 is left untouched (log(0) would be -inf).
    df_country_year_slope[year] = df_country_year_slope[year].apply(lambda x: 0 if x == 0 else math.log(x))
# df_country_year_slope.to_csv('dump_country_analysis/country_year_contributions_slope.csv')
# Visualize the shifted, log-scaled slopes computed above.
df = pd.read_csv('dump_country_analysis/country_year_contributions_slope.csv')
df = df.set_index('country')
years = df.columns.values[1:]
traces = []
for country in df.index:
    row = df.loc[country]
    traces.append(go.Scatter(x=years, y=row[1:], name=country))
layout = go.Layout(title='Rate of increase of contributions in logarithmic scale',
                   xaxis=dict(title='Year'),
                   yaxis=dict(title='Value'))
fig = go.Figure(data=traces, layout=layout)
# Zoom in on the narrow band where the curves actually differ.
fig.update_layout(yaxis_range=[8.20,8.40])
fig.show()
The chart indicates that before 2015, most countries experienced a constant rate of growth in contributions, with the United States and the United Kingdom having higher growth rates than the others. However, in 2021 the number of contributions decreased, resulting in lower growth rates for all countries. India had a constant growth rate until 2020, but it experienced a decrease in the number of contributions in 2021. In terms of AI development breakthroughs, it is worth noting that 2015 was a significant year for the field of artificial intelligence: it was the year DeepMind's AlphaGo first defeated a professional Go player (Fan Hui), followed by its victory over world champion Lee Sedol in 2016.
Preprocess the text data for each row in the title column. The function below cleans the text: it lowercases it, collapses extra whitespace, removes ellipsis fragments, splits hyphenated words, strips punctuation, and drops stopwords, digit-only tokens, and single-character tokens.
def clean_text_data(text, tokenizer):
    """Lowercase, normalize, and tokenize `text`, dropping noise tokens.

    Removes extra whitespace, ellipses, punctuation, English stopwords,
    digit-only tokens, and single-character tokens. `tokenizer` is any
    callable mapping a string to a list of tokens.
    """
    stop_words = set(stopwords.words("english"))
    cleaned = str(text).lower()
    cleaned = re.sub(r"\s+", " ", cleaned)             # collapse whitespace runs
    cleaned = re.sub(r"\w+…|…", "", cleaned)           # drop ellipsis fragments
    cleaned = re.sub(r"(?<=\w)-(?=\w)", " ", cleaned)  # split hyphenated words
    cleaned = re.sub(f"[{re.escape(string.punctuation)}]", "", cleaned)
    # Keep tokens that are not stopwords, not pure digits, and longer than
    # one character (equivalent to the original blank-out-then-filter steps).
    return [tok for tok in tokenizer(cleaned)
            if tok not in stop_words and not tok.isdigit() and len(tok) > 1]
Make a new column in the dataframe named "tokens" for storing the cleaned title data.
text_columns = ["title"]
# Load data.
# NOTE(review): this CSV is expected to expose lowercase 'title'/'year'/
# 'abstract' columns, unlike the capitalized schema shown earlier — confirm
# it is the intended file.
df_original = pd.read_csv('papers.csv')
# Work on a de-duplicated copy so the raw frame stays untouched.
df_copy = df_original.drop_duplicates().copy()
for col in text_columns:
    df_copy[col] = df_copy[col].astype(str)
# Tokenized titles become the clustering features.
df_copy["tokens"] = df_copy["title"].map(lambda x: clean_text_data(x, word_tokenize))
# Titles that normalize to identical token lists are duplicates too.
df_copy = df_copy.drop_duplicates(subset=["tokens"])
# Drop rows whose title produced no tokens at all.
df_copy = df_copy[df_copy["tokens"].apply(len) > 0]
# Keep only the columns used downstream, with a fresh index.
df_copy = df_copy[["title", "tokens", "year", "abstract"]].reset_index(drop=True)
title_values = df_copy["title"].values
tokens_values = df_copy["tokens"].values
# SAVE Title and token values
# np.save("title_values.npy", title_values)
# np.save("tokens_values.npy", tokens_values)
# Save csv file
# df_copy.to_csv("Cleaned_DF.csv", index=False)
# tokens_values = np.load("tokens_values.npy", allow_pickle=True)
# Load the token arrays cached by an earlier run.
tokens_values = np.load("Dump/tokens_values.npy", allow_pickle=True)
This is the look of our dataframe after preprocessing the title data.
# Cleaned dataframe cached from the preprocessing step above.
df_cleaned = pd.read_csv("Dump/Cleaned_DF.csv")
# Train word2vec model
# (100-dimensional embeddings; all other Word2Vec parameters are gensim defaults)
model = Word2Vec(sentences=tokens_values, vector_size=100)
# Save model
# model.save("model.model")
# Get word vectors for each token in list_of_tokens
def vectorize(token_values, model):
    """Average per-token word vectors into one feature vector per document.

    Parameters
    ----------
    token_values : iterable of token lists (one list per document title).
    model : word2vec-style model exposing `vector_size` and a `wv` mapping
        that supports `in` and item access.

    Returns
    -------
    list of np.ndarray
        One `model.vector_size`-dimensional vector per document; documents
        with no in-vocabulary tokens get an all-zero vector.
    """
    features = []
    for tokens in token_values:
        # FIX: the original both tested `token in model.wv` AND wrapped the
        # lookup in try/except KeyError — the membership test alone suffices.
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if vectors:
            features.append(np.asarray(vectors).mean(axis=0))
        else:
            features.append(np.zeros(model.vector_size))
    return features
# Build one averaged embedding per paper title.
vectorized_tokens = vectorize(tokens_values, model=model)
# Save vectorized tokens
# np.save("vectorized_tokens.npy", vectorized_tokens)
def mbkmeans_clusters(X, num_clusters, mb):
    """Cluster X with MiniBatchKMeans and print a per-cluster summary.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Matrix of features.
    num_clusters : int
        Number of clusters to fit.
    mb : int
        Size of mini-batches.

    Returns
    -------
    (MiniBatchKMeans, np.ndarray)
        The fitted model and its per-sample cluster labels.
    """
    mbkm = MiniBatchKMeans(n_clusters=num_clusters, batch_size=mb).fit(X)
    print(f"Cluster value = {num_clusters}")
    print(f"Silhouette coefficient: {silhouette_score(X, mbkm.labels_):0.2f}")
    cluster_count = {}
    for i in range(num_clusters):
        # PERF FIX: count labels directly. The original computed the full
        # silhouette_samples array (expensive pairwise distances) only to
        # take its per-cluster length — the printed counts are identical.
        cluster_count[i] = int(np.sum(mbkm.labels_ == i))
        print(f" Number of papers in Cluster{i} : {cluster_count[i]}")
    return mbkm, mbkm.labels_
# Retrieve vectorized tokens
vectorized_tokens = np.load("Dump/vectorized_tokens.npy")
# Fit 10 clusters using mini-batches of 500 titles.
mbkmeans_model, cluster_labels = mbkmeans_clusters(X=vectorized_tokens, num_clusters = 10, mb=500)
# Save model
# joblib.dump(mbkmeans_model, "mbkmeans_model.pkl")
# Save cluster labels
# np.save("cluster_labels.npy", cluster_labels)
Cluster value = 10 Silhouette coefficient: 0.37 Number of papers in Cluster0 : 1461 Number of papers in Cluster1 : 1094 Number of papers in Cluster2 : 769 Number of papers in Cluster3 : 64 Number of papers in Cluster4 : 1367 Number of papers in Cluster5 : 1175 Number of papers in Cluster6 : 562 Number of papers in Cluster7 : 285 Number of papers in Cluster8 : 1320 Number of papers in Cluster9 : 1583
We chose the cluster size to be 10 after observing how the silhouette score changes with the number of clusters: it is maximized at 10 clusters. The resulting silhouette coefficient is 0.37; since the score ranges from -1 to 1, a positive value of this magnitude indicates reasonable, though not perfect, cluster separation.
# Load model and cluster labels
mbkmeans_model = joblib.load("Dump/mbkmeans_model.pkl")
cluster_labels = np.load("Dump/cluster_labels.npy")
# Load title and token values
title_values = np.load("Dump/title_values.npy", allow_pickle=True)
# Assemble one row per paper: title, joined tokens, cluster id, metadata.
# BUG FIX: the dict literal previously listed "title" twice (once from
# title_values, once from df_cleaned); the second silently overwrote the
# first. A single "title" entry is kept, preserving the resulting columns.
# NOTE(review): tokens_values, cluster_labels, and df_cleaned are assumed to
# be row-aligned — verify the upstream preprocessing preserves that order.
df_clusters = pd.DataFrame({
    "title": df_cleaned["title"].values,
    "tokens": [" ".join(text) for text in tokens_values],
    "cluster": cluster_labels,
    "year": df_cleaned["year"].values,
    "abstract": df_cleaned["abstract"].values
})
# save csv file
# df_clusters.to_csv("Clustering_Results.csv", index=False)
df_clustering_results = pd.read_csv("Dump/Clustering_Results.csv")
We chose the cluster size to be 10 after observing how the silhouette score changes with the number of clusters: it is maximized at 10 clusters. The resulting silhouette coefficient is 0.37; since the score ranges from -1 to 1, a positive value of this magnitude indicates reasonable, though not perfect, cluster separation.
# For each cluster, show the title whose embedding is closest to the centroid.
cluster_dict = {}
for cluster_id in range(10):
    distances = np.linalg.norm(vectorized_tokens - mbkmeans_model.cluster_centers_[cluster_id], axis=1)
    nearest = int(np.argmin(distances))
    print(f"Cluster {cluster_id}", ":", title_values[nearest])
    cluster_dict[cluster_id] = title_values[nearest]
Cluster 0 : Computation of Similarity Measures for Sequential Data using Generalized Suffix Trees Cluster 1 : Phase Transitions and Cyclic Phenomena in Bandits with Switching Constraints Cluster 2 : Identifying Alzheimer's Disease-Related Brain Regions from Multi-Modality Neuroimaging Data using Sparse Composite Linear Discrimination Analysis Cluster 3 : Hindsight Experience Replay Cluster 4 : Attend and Predict: Understanding Gene Regulation by Selective Attention on Chromatin Cluster 5 : Efficient Bayes-Adaptive Reinforcement Learning using Sample-Based Search Cluster 6 : Non-linear Prediction of Acoustic Vectors Using Hierarchical Mixtures of Experts Cluster 7 : AGEM: Solving Linear Inverse Problems via Deep Priors and Sampling Cluster 8 : Receptive Field Formation in Natural Scene Environments: Comparison of Single Cell Learning Rules Cluster 9 : Depth-First Proof-Number Search with Heuristic Edge Cost and Application to Chemical Synthesis Planning
def find_topWords_and_Summarize(i, data=None, top_n=15):
    """Print (and return) the ``top_n`` most frequent tokens of cluster ``i``.

    Parameters
    ----------
    i : int
        Cluster label to summarize.
    data : pandas.DataFrame, optional
        Frame with 'cluster' and 'tokens' columns. Defaults to the
        module-level ``df_clustering_results`` (original behavior).
    top_n : int, optional
        Number of most common words to include (default 15, as before).

    Returns
    -------
    str
        The space-separated top words (previously only printed).
    """
    frame = df_clustering_results if data is None else data
    # Filter the rows belonging to the requested cluster.
    cluster_rows = frame[frame['cluster'] == i]
    # Flatten the whitespace-joined token strings into one word list.
    words = [word for row in cluster_rows['tokens'] for word in row.split()]
    # collections.Counter replaces nltk.FreqDist here: FreqDist is a
    # Counter subclass, so most_common() behaves identically, and this
    # removes the nltk dependency from this function.
    top_words = [word for word, _ in Counter(words).most_common(top_n)]
    summary = " ".join(top_words)
    print(f"Summary for Cluster {i}", summary)
    return summary
# Print the top-word summary for every one of the 10 clusters.
for cluster_index in range(10):
    find_topWords_and_Summarize(cluster_index)
Summary for Cluster 0 learning neural networks models using model bayesian based optimization via data deep efficient inference network Summary for Cluster 1 learning neural networks model using models based sparse optimal clustering classification network optimization multi adaptive Summary for Cluster 2 learning networks neural models using model data bayesian via estimation based network multi analysis deep Summary for Cluster 3 replay fairness hindsight experience recognizing languages music compute balancing observation neuromorphic regularizing mobile environment occams Summary for Cluster 4 learning sensitive gans ocular dominance computation competition processing silicon examples mechanisms kernel word automatic repeated Summary for Cluster 5 learning networks neural models model deep stochastic using network reinforcement bayesian optimization multi inference gaussian Summary for Cluster 6 learning neural networks models using via deep based model multi inference optimization network data analysis Summary for Cluster 7 learning neural networks models using model bayesian via deep data based optimization gaussian inference stochastic Summary for Cluster 8 learning neural networks using models data model network deep inference based sparse analysis time bayesian Summary for Cluster 9 learning networks neural model using models network via optimization deep based classification image optimal bayesian
Based on the summaries of the clusters, we can see that some common topics across the clusters include:
Learning neural networks using models, data, and deep techniques Bayesian-based optimization and inference Multi-model and multi-analysis approaches Stochastic processes and reinforcement learning Classification and image processing Sensitivity, fairness, and regularizing mechanisms Computation and processing in natural scenes and environments. These topics reflect some of the most popular and growing research areas in machine learning and artificial intelligence during the years analyzed.
# Project the token vectors to 2-D to visually check cluster separation.
pca = PCA(n_components=2)
pca_transformed = pca.fit_transform(vectorized_tokens)
plt.figure(figsize=(20, 10))
sns.scatterplot(x=pca_transformed[:, 0], y=pca_transformed[:, 1],
                hue=cluster_labels, legend='full', palette='Paired')
plt.title('PCA Visualization of Clusters')
# BUG FIX: savefig must run before show() — show() releases the current
# figure, so the original order wrote an empty canvas (see the stray
# "<Figure size 640x480 with 0 Axes>" output that followed this cell).
plt.savefig('PCA Visualization of Clusters.png')
plt.show()
# Echo the representative paper title chosen for each cluster.
for cluster_id, representative_title in cluster_dict.items():
    print(f"Cluster {cluster_id}:", representative_title)
Cluster 0: Computation of Similarity Measures for Sequential Data using Generalized Suffix Trees Cluster 1: Phase Transitions and Cyclic Phenomena in Bandits with Switching Constraints Cluster 2: Identifying Alzheimer's Disease-Related Brain Regions from Multi-Modality Neuroimaging Data using Sparse Composite Linear Discrimination Analysis Cluster 3: Hindsight Experience Replay Cluster 4: Attend and Predict: Understanding Gene Regulation by Selective Attention on Chromatin Cluster 5: Efficient Bayes-Adaptive Reinforcement Learning using Sample-Based Search Cluster 6: Non-linear Prediction of Acoustic Vectors Using Hierarchical Mixtures of Experts Cluster 7: AGEM: Solving Linear Inverse Problems via Deep Priors and Sampling Cluster 8: Receptive Field Formation in Natural Scene Environments: Comparison of Single Cell Learning Rules Cluster 9: Depth-First Proof-Number Search with Heuristic Edge Cost and Application to Chemical Synthesis Planning
<Figure size 640x480 with 0 Axes>
# Aggregate paper counts per (year, cluster) pair.
df_grouped = df_clustering_results.groupby(['year', 'cluster']).size().reset_index(name='counts')
# Fixed palette: one distinct color per cluster.
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
          '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']
# Build one bar trace per cluster for the stacked chart.
traces = []
for cluster_id in range(10):
    cluster_rows = df_grouped[df_grouped['cluster'] == cluster_id]
    traces.append(go.Bar(
        x=cluster_rows['year'],
        y=cluster_rows['counts'],
        name=f'Cluster {cluster_id}',
        marker=dict(color=colors[cluster_id]),
    ))
# Chart layout: stacked bars, one column per year.
layout = go.Layout(
    title='Yearly distribution of papers by cluster',
    xaxis=dict(title='Year'),
    yaxis=dict(title='Count of papers for each Cluster'),
    barmode='stack',
)
# Assemble and render the figure.
fig = go.Figure(data=traces, layout=layout)
fig.show()
# Repeat the cluster -> representative-title legend next to the chart.
for key in cluster_dict:
    print(f"Cluster {key}:", cluster_dict[key])
Cluster 0: Computation of Similarity Measures for Sequential Data using Generalized Suffix Trees Cluster 1: Phase Transitions and Cyclic Phenomena in Bandits with Switching Constraints Cluster 2: Identifying Alzheimer's Disease-Related Brain Regions from Multi-Modality Neuroimaging Data using Sparse Composite Linear Discrimination Analysis Cluster 3: Hindsight Experience Replay Cluster 4: Attend and Predict: Understanding Gene Regulation by Selective Attention on Chromatin Cluster 5: Efficient Bayes-Adaptive Reinforcement Learning using Sample-Based Search Cluster 6: Non-linear Prediction of Acoustic Vectors Using Hierarchical Mixtures of Experts Cluster 7: AGEM: Solving Linear Inverse Problems via Deep Priors and Sampling Cluster 8: Receptive Field Formation in Natural Scene Environments: Comparison of Single Cell Learning Rules Cluster 9: Depth-First Proof-Number Search with Heuristic Edge Cost and Application to Chemical Synthesis Planning
Based on the given information, we can infer that the topics covered by Cluster 0, Cluster 6, and Cluster 8 were the most popular research areas during the years 2016-2019. These clusters had the most number of papers and were the most dense among all the clusters during this period.
Cluster 0 deals with the computation of similarity measures for sequential data using generalized suffix trees. This suggests that researchers were interested in developing new techniques for processing and analyzing sequential data. This could have important implications for various domains such as bioinformatics, natural language processing, and data mining.
Cluster 6 focuses on non-linear prediction of acoustic vectors using hierarchical mixtures of experts. This implies that researchers were exploring new ways to improve speech recognition systems, which is an important area of research given the growing demand for voice assistants and speech-to-text technologies.
Cluster 8 deals with receptive field formation in natural scene environments and the comparison of single cell learning rules. This suggests that researchers were interested in understanding how neural networks learn and process visual information. This could have important implications for developing more efficient and effective computer vision systems.
Overall, the analysis suggests that the research topics covered by Cluster 0, Cluster 6, and Cluster 8 were growing during the years 2016–2019 and could have important implications for the development of new technologies and systems in the future.
# Reload the raw author-level dataset.
df = pd.read_csv('papers.csv')
# Frame for the citation analysis: one row per paper, with the
# per-author rows merged into lists (filled in below).
df_citations = pd.DataFrame(columns=df.columns)
In the given dataset, if a paper has multiple authors then its row is repeated once per author. So we merge the rows sharing the same title and create a new dataset.
# Collapse author-level rows into one row per paper. Rows belonging to
# the same paper are adjacent in the CSV, so a single forward scan is
# enough. (The original kept a second cursor `ct` that always equaled
# `i`; it has been removed.)
merged_rows = []
i = 0
while i < len(df):
    title = df.loc[i, 'Title']
    conf = df.loc[i, 'Conference']
    year = df.loc[i, 'Year']
    authors = []
    aff = []
    # Consume every consecutive row of the same paper.
    while i < len(df) and df.loc[i, 'Title'] == title:
        authors.append(df.loc[i, 'Author'])
        aff.append(df.loc[i, 'Affiliation'])
        i += 1
    merged_rows.append([conf, year, title, authors, aff])
# Appending row-by-row with .loc is O(n^2) in pandas; build the frame once.
df_citations = pd.DataFrame(merged_rows, columns=df_citations.columns)
Using the Semantic Scholar API we retrieve the citations for each paper from its title. However, due to the limit on the number of requests we can make to the API and the time it takes to resolve an id from a title and fetch citations by id, we only retrieve citations for papers belonging to the years 2006–2012.
# Build the citation graph from the Semantic Scholar API (papers <= 2012).
error_rate = 0
citation_graph = pd.DataFrame(columns=['PaperId', 'Cited_PaperId'])
num_papers = len(df_citations[df_citations['Year'] <= 2012])
# BUG FIX: the file imports `from tqdm import tqdm`, so `tqdm.trange`
# raised AttributeError (tqdm is the class, not the module); wrap
# range() with tqdm instead.
for i in tqdm(range(num_papers)):
    title = df_citations.loc[i, 'Title']
    try:
        # Resolve the Semantic Scholar paper id from the title.
        query = "http://api.semanticscholar.org/graph/v1/paper/search?query=" + title
        response = requests.get(query, timeout=30).json()
        paper_id = response['data'][0]['paperId']  # renamed from `id` (shadowed the builtin)
        citation_graph.loc[i] = [paper_id, []]
        # Fetch the papers that cite this id.
        search_by_id = ("https://api.semanticscholar.org/graph/v1/paper/"
                        + paper_id + "?fields=title,citations.authors,authors")
        response = requests.get(search_by_id, timeout=30).json()
        for citation in response['citations']:
            citation_graph.loc[i, 'Cited_PaperId'].append(citation['paperId'])
    except (requests.RequestException, KeyError, IndexError, ValueError):
        # Narrowed from a bare `except:` — this stays best-effort for
        # missing papers / flaky responses, but no longer swallows
        # KeyboardInterrupt or SystemExit. (The unused `dic2` alias and
        # manual json.loads were replaced by Response.json().)
        error_rate = error_rate + 1
    if i % 100 == 0:
        # Periodic progress report and checkpoint.
        print(error_rate / (i + 1))
        citation_graph.to_csv('citation_graph_2012.csv', index=False)
print(error_rate / num_papers)
# citation_graph.to_csv('citation_graph_2012.csv',index=False)
We were able to get citations for around 1400 papers from around 1800 papers belonging to year 2006-2012. The reason for not getting citations for all the papers is that some of the papers are not present in the Semantic Scholar database.
# citation_graph.to_csv('citation_graph_2012.csv',index=False)
# this file will be used later, after mapping countries to papers
'paper_with_country.csv' already contains the country of each paper. We will map the title of paper to a country. However, in cases when a paper has multiple authors belonging to different countries we will assign majority country to the paper.
# Restrict to papers up to 2012 and resolve each unique title to a
# Semantic Scholar paper id.
country = pd.read_csv('citation_data/papers_with_country.csv')
# .copy() makes this an independent frame, avoiding pandas'
# SettingWithCopy ambiguity when the 'PaperId' column is added.
country_till_2012 = country[country['Year'] <= 2012].copy()
country_till_2012['PaperId'] = np.nan
title_to_id = {}
unique_titles = country_till_2012['Title'].unique()
error_rate = 0
# BUG FIX: `tqdm.trange` fails under `from tqdm import tqdm`; wrap range().
for i in tqdm(range(len(unique_titles))):
    title = unique_titles[i]
    url = "http://api.semanticscholar.org/graph/v1/paper/search?query=" + title
    try:
        response = requests.get(url, timeout=30).json()
        # renamed from `id` to avoid shadowing the builtin
        title_to_id[title] = response['data'][0]['paperId']
    except (requests.RequestException, KeyError, IndexError, ValueError):
        # Narrowed from a bare `except:` — still best-effort per title.
        error_rate += 1
    if i % 100 == 0:
        print(i)
        print(error_rate / (i + 1))
        print("-------------------")
print(error_rate / len(unique_titles))
# with open('title_to_id.json', 'w') as fp:
#     json.dump(title_to_id, fp)
# load json file
with open('citation_data/title_to_id.json', 'r') as fp:
    title_to_id = json.load(fp)
# BUG FIX: the original loop tested a stale `title` variable left over
# from the previous cell (it was never reassigned inside the loop), so
# every row received the result of the same lookup. Map each row's own
# Title instead; titles missing from title_to_id become NaN, matching
# the original else-branch.
country_till_2012['PaperId'] = country_till_2012['Title'].map(title_to_id)
100%|██████████| 5564/5564 [00:00<00:00, 13419.84it/s]
# country_till_2012.to_csv('papers_with_country_id.csv',index=False)
country_till_2012 = pd.read_csv('citation_data/papers_with_country_id.csv')
citation_graph = pd.read_csv('citation_data/citation_graph_2012_with_country.csv') if False else pd.read_csv('citation_data/citation_graph_2012.csv')
# The CSV stores the id lists as strings; parse them back into lists.
# ast.literal_eval only accepts Python literals, unlike eval() which
# would execute arbitrary code read from the file.
from ast import literal_eval
for i in tqdm(range(len(citation_graph))):
    # .at replaces the chained-assignment pattern citation_graph[col][i] = ...
    citation_graph.at[i, 'Cited_PaperId'] = literal_eval(citation_graph['Cited_PaperId'][i])
citation_graph.head()
100%|██████████| 1414/1414 [00:00<00:00, 6274.65it/s]
| PaperId | Cited_PaperId | |
|---|---|---|
| 0 | 1cac3ee85e52cb0afde71f66472e35a315a4a112 | [ad82862b99028bf87eda454bf0f92788f19cdbac, 489... |
| 1 | 7227f224679969446d5363c7dbfd61bcdd4b338f | [c5c744b1609f473c882bcbd751eb13a72f13e7bc, f5a... |
| 2 | 167126a5c592ea0ac743228c5465b45eed3f0e84 | [784355019ffeec8bd01557fce49891bcf697e57c, 7a1... |
| 3 | 219a1eed0d61a144024a6d1cf487c116d1cfd017 | [fd1621615511d7dd035863c112757e67172da7b5, f86... |
| 4 | dc08a44d88826a80472c374a287a78376a167fd0 | [7ee1483b3169576c18ade8833fc555af2590aae1, a44... |
# Invert title_to_id and attach a Title column to the citation graph.
id_to_title = {paper_id: paper_title for paper_title, paper_id in title_to_id.items()}
# Series.map replaces the per-row try/except loop: ids that are absent
# from id_to_title map to NaN automatically, matching the original
# except-branch, without chained assignment.
citation_graph['Title'] = citation_graph['PaperId'].map(id_to_title)
# Majority-vote a single country per title (a paper may have authors
# from several countries; 'None' entries do not vote).
title_to_country = {}
papers_with_country = pd.read_csv('citation_data/papers_with_country_id.csv')
for title in unique_titles:
    countries = papers_with_country[papers_with_country['Title'] == title]['country'].values
    # Counter replaces the hand-rolled count dict + sort; most_common
    # breaks ties in first-seen order, like the original stable sort.
    votes = Counter(c for c in countries if c != 'None')
    title_to_country[title] = votes.most_common(1)[0][0] if votes else 'None'
# Attach the majority country to each graph row. Series.map replaces
# the per-row try/except loop: titles missing from title_to_country
# (including NaN titles) become NaN, matching the original except-branch.
citation_graph['Country'] = citation_graph['Title'].map(title_to_country)
Number of citations for each paper is calculated as follows:
# Reload the citation graph that already has the Title and Country
# columns attached (produced by the mapping steps above and saved to CSV).
citation_graph=pd.read_csv('citation_data/citation_graph_2012_with_country.csv')
citation_graph.head()# citation_graph.to_csv('citation_graph_2012_with_country.csv',index=False)
| PaperId | Cited_PaperId | Title | Country | |
|---|---|---|---|---|
| 0 | 1cac3ee85e52cb0afde71f66472e35a315a4a112 | ['ad82862b99028bf87eda454bf0f92788f19cdbac', '... | Multi-Task Feature Learning | France |
| 1 | 7227f224679969446d5363c7dbfd61bcdd4b338f | ['c5c744b1609f473c882bcbd751eb13a72f13e7bc', '... | Context Effects in Category Learning: An Inve... | United States |
| 2 | 167126a5c592ea0ac743228c5465b45eed3f0e84 | ['784355019ffeec8bd01557fce49891bcf697e57c', '... | Comparative Gene Prediction using Conditional ... | United States |
| 3 | 219a1eed0d61a144024a6d1cf487c116d1cfd017 | ['fd1621615511d7dd035863c112757e67172da7b5', '... | Causal inference in sensorimotor integration | United States |
| 4 | dc08a44d88826a80472c374a287a78376a167fd0 | ['7ee1483b3169576c18ade8833fc555af2590aae1', '... | Geometric entropy minimization (GEM) for anoma... | United States |
# Re-number the rows 0..n-1 after the CSV round-trip.
citation_graph.reset_index(drop=True, inplace=True)
Converting str back into list (lists are serialized as strings when saved to CSV).
# Parse the stringified id lists back into Python lists.
# ast.literal_eval is the safe replacement for eval() on file contents:
# it accepts only Python literals and cannot execute code.
from ast import literal_eval
for i in tqdm(range(len(citation_graph))):
    citation_graph.loc[i, "Cited_PaperId"] = literal_eval(citation_graph.loc[i, 'Cited_PaperId'])
citation_graph.head()
100%|██████████| 1414/1414 [00:00<00:00, 5310.14it/s]
| PaperId | Cited_PaperId | Title | Country | |
|---|---|---|---|---|
| 0 | 1cac3ee85e52cb0afde71f66472e35a315a4a112 | [ad82862b99028bf87eda454bf0f92788f19cdbac, 489... | Multi-Task Feature Learning | France |
| 1 | 7227f224679969446d5363c7dbfd61bcdd4b338f | [c5c744b1609f473c882bcbd751eb13a72f13e7bc, f5a... | Context Effects in Category Learning: An Inve... | United States |
| 2 | 167126a5c592ea0ac743228c5465b45eed3f0e84 | [784355019ffeec8bd01557fce49891bcf697e57c, 7a1... | Comparative Gene Prediction using Conditional ... | United States |
| 3 | 219a1eed0d61a144024a6d1cf487c116d1cfd017 | [fd1621615511d7dd035863c112757e67172da7b5, f86... | Causal inference in sensorimotor integration | United States |
| 4 | dc08a44d88826a80472c374a287a78376a167fd0 | [7ee1483b3169576c18ade8833fc555af2590aae1, a44... | Geometric entropy minimization (GEM) for anoma... | United States |
# Map paper ids <-> dense integer indices so graph nodes are small ints.
id_to_index = {}
index_to_id = {}
for i in range(len(citation_graph)):
    id_to_index[citation_graph.loc[i, 'PaperId']] = i
    index_to_id[i] = citation_graph.loc[i, 'PaperId']
# Adjacency structure: node -> {country, in-dataset citing-paper indices}.
# (The original initialized 'country' to 'None' and immediately
# overwrote it; each entry is now built in a single step.)
graph = {}
for i in range(len(citation_graph)):
    graph[i] = {
        'country': citation_graph.loc[i, 'Country'],
        'Cited_PaperId': [id_to_index[j]
                          for j in citation_graph.loc[i, 'Cited_PaperId']
                          if j in id_to_index],
    }
# save the graph as json (used by the pyvis visualization below)
with open('citation_graph_2012.json', 'w') as fp:
    json.dump(graph, fp)
For most papers the number of times they are cited is 0. One reason is that we only consider papers from the years 2006–2012, so citations received after 2012 are not counted. We also only count citations among the dataset's own papers, so a citation from a paper outside the dataset is not counted either.
# For each paper, count how many of its citing papers are themselves in
# the dataset (ids outside the dataset are ignored).
citation_count = {}
for node in range(len(citation_graph)):
    in_dataset = [pid for pid in citation_graph.loc[node, 'Cited_PaperId'] if pid in id_to_index]
    citation_count[node] = len(in_dataset)
# Rank papers by that count (kept for inspection).
sorted_citation_count = sorted(citation_count.items(), key=lambda item: item[1], reverse=True)
# Histogram of the per-paper citation counts.
plt.figure(figsize=(15, 5))
plt.hist(citation_count.values(), bins=100)
plt.xlabel('Citation count')
plt.title('Citation count of papers')
Text(0.5, 1.0, 'Citation count of papers')
Top 10 countries with most number of papers will be assigned a unique color and rest will be assigned a common color
# The ten countries contributing the most papers to the graph.
top_10_countries = citation_graph['Country'].value_counts().head(10).index
top_10_countries
Index(['United States', 'United Kingdom', 'Germany', 'Canada', 'France',
'Italy', 'China', 'Israel', 'Australia', 'Switzerland'],
dtype='object')
Top 10 countries with most papers make up around 87% of the total papers
# Percentage of graph papers contributed by the top-10 countries.
top10_total = citation_graph['Country'].value_counts().head(10).sum()
print(100 * top10_total / len(citation_graph))
87.41159830268741
The size of a node is proportional to the number of times the paper is cited. The color of a node is based on the country of the paper.
from pyvis.network import Network
net = Network(notebook=True)
# Nodes: size = within-dataset citation count; color group = country
# (each top-10 country gets its own group, everything else shares group 10).
for node, attrs in graph.items():
    node_country = attrs['country']
    if node_country in top_10_countries:
        # translate the country name into its rank among the top 10
        node_group = top_10_countries.get_loc(node_country)
    else:
        node_group = 10
    net.add_node(node, size=citation_count[node], group=node_group)
# Edges: paper -> each in-dataset paper that cites it.
for node, attrs in graph.items():
    for neighbour in attrs['Cited_PaperId']:
        net.add_edge(node, neighbour)
net.show_buttons(filter_=['physics'])
# net.save_graph('citation_data/citation_graph_2012_2.html')
net.show('citation_data/citation_graph_2012_2.html')
Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook. citation_data/citation_graph_2012_2.html
# Paper count per author (the raw frame has one row per author/paper
# pair). Counter replaces the manual dict accumulation, and
# most_common() replaces the explicit sorted(..., reverse=True) — both
# produce (author, count) pairs in descending count order with
# first-seen tie-breaking.
author_count = Counter(df['Author'])
sorted_author_count = author_count.most_common()
sorted_author_count
# save as json
with open('citation_data/author_count.json', 'w') as f:
    json.dump(sorted_author_count, f)
# Keep the 100 most prolific authors; preview the top 10.
top_100_authors = sorted_author_count[:100]
top_10_authors = top_100_authors[:10]
top_10_authors
[('Sergey Levine', 102),
('Yoshua Bengio', 89),
('Michael Jordan', 88),
('Pieter Abbeel', 76),
('Lawrence Carin', 76),
('Masashi Sugiyama', 63),
('Francis Bach', 63),
('Le Song', 61),
('Pradeep Ravikumar', 60),
('Remi Munos', 58)]
Getting the number of papers on which two authors have collaborated on all combinations of top 100 authors.
# Number of shared paper titles for every ordered pair of top-100
# authors (both directions are kept, as in the original output).
# PERFORMANCE FIX: the original re-filtered the full frame for every
# pair (O(n^2) full scans); hoist each author's title set out of the
# pair loop and compute it once.
titles_by_author = {name: set(df[df['Author'] == name]['Title'])
                    for name, _ in top_100_authors}
rows = []
for author1 in top_100_authors:
    for author2 in top_100_authors:
        if author1 != author2:
            common_papers = len(titles_by_author[author1[0]] & titles_by_author[author2[0]])
            if common_papers > 0:
                rows.append([author1[0], author2[0], common_papers])
# Build the frame once instead of appending row-by-row with .loc
# (which is quadratic in pandas).
collobration = pd.DataFrame(rows, columns=['Author1', 'Author2', 'Weight'])
collobration = collobration.sort_values(by='Weight', ascending=False)
collobration.head()
# collobration.to_csv('citation_data/collobration.csv',index=False)
# Reload the saved collaboration table.
collobration = pd.read_csv('citation_data/collobration.csv')
# Shift every weight down by one and store as 32-bit ints
# (combines the original "remove 1" and int64->int32 steps).
collobration['Weight'] = (collobration['Weight'] - 1).astype('int32')
from pyvis.network import Network

net = Network(notebook=True)
# Node size reflects the author's total paper count.
for author_name, paper_count in top_100_authors:
    net.add_node(author_name, size=paper_count)
# Edge value reflects how many papers the two authors share.
for idx in range(len(collobration)):
    net.add_edge(collobration.loc[idx, 'Author1'],
                 collobration.loc[idx, 'Author2'],
                 value=int(collobration.loc[idx, 'Weight']))
net.save_graph("citation_data/collobration2.html")
Warning: When cdn_resources is 'local' jupyter notebook has issues displaying graphics on chrome/safari. Use cdn_resources='in_line' or cdn_resources='remote' if you have issues viewing graphics in a notebook.
# Render the collaboration network to an HTML file and display it inline.
net.show('citation_data/collobration2.html')
citation_data/collobration2.html
References: